3. Les parlementaires sur le réseau social Twitter

3.1. Analyse des données textuelles

import pandas as pd
from lib.figures import *
from lib.constant import *
from lib.utils import *

from bokeh.io import output_notebook
# Route Bokeh output to the notebook, without the loading banner.
output_notebook(hide_banner=True)

# Random seed reused further down (passed to UMAP as `random_state`).
seed = 42

# Tweet dataset used throughout this page; the same file is reloaded at the
# start of the later sections.
twitter_df = pd.read_parquet('data/twitter_fev_to_juin_2023_retraite_data.parquet_v2')
# Helper comes from one of the `lib.*` star imports above; as the last
# expression of the cell, whatever it returns is what the notebook displays.
intervention_frequency_per_group(twitter_df)
Hide code cell source
from bokeh.models import TabPanel, Tabs

# Count table built by the project helper; filtered on `num_words` below.
df = getCountDataframe(twitter_df, top_n=10)

# One tab per expression length (1, 2 and 3 words), each holding the
# per-political-group occurrence figure for that length.
tab1, tab2, tab3 = (
    TabPanel(
        child=occurrenceDistributionPerGroupePolitique(df[df.num_words == n_words]),
        title=tab_title,
    )
    for n_words, tab_title in ((1, "1 mot"), (2, "2 mots"), (3, "3 mots"))
)

show(Tabs(tabs=[tab1, tab2, tab3], sizing_mode="stretch_width"))
from bokeh.models import TabPanel, Tabs

# Count table built by the project helper; filtered on `num_words` below.
df = getCountDataframe(twitter_df, top_n=10)

# One tab per expression length (1, 2 and 3 words), each holding the
# per-political-orientation occurrence figure for that length.
tab1, tab2, tab3 = (
    TabPanel(
        child=occurrenceDistributionPerPolitiqueOrientation(df[df.num_words == n_words]),
        title=tab_title,
    )
    for n_words, tab_title in ((1, "1 mot"), (2, "2 mots"), (3, "3 mots"))
)

show(Tabs(tabs=[tab1, tab2, tab3], sizing_mode="stretch_width"))

3.2. Network Data

twitter_df = pd.read_parquet('data/twitter_fev_to_juin_2023_retraite_data.parquet_v2')

# Keep only retweets.  The null test must be `notna()`: an element-wise
# `retweet_id == None` comparison is False on every row in pandas, so
# negating it keeps the whole frame instead of dropping non-retweets.
twitter_df = twitter_df[twitter_df.retweet_id.notna()]
# Keep only tweets flagged by `is_keywords`.  `.copy()` detaches the result
# from the parent frame so the column assignment below does not write to a
# slice (SettingWithCopyWarning).
twitter_df = twitter_df[twitter_df.is_keywords].copy()

# Deputy registry: used to translate the dataset's `username` (the deputy
# slug) into the deputy's Twitter handle.
deputy_df = pd.read_csv("data/nosdeputes.fr_deputes_en_mandat_2023-08-02.csv", sep=";")
slug2twitterat = dict(deputy_df[["slug", "twitter"]].values)
# Usernames with no entry in the registry get NaN here.
twitter_df["twitter_at"] = twitter_df.username.map(slug2twitterat)
twitter_df.head(2)
username full_text date in_reply_to_screen_name in_reply_to_status_id_str in_reply_to_user_id_str retweet_id retweet_username retweet_user_id is_quote_status quoted_status_id_str groupe_sigle hashtag is_hashtag lemmatization keywords_detected is_keywords twitter_at
464 jean-luc-fugit RT : La réforme des retraites soulève la quest... 2023-02-01 00:00:59+00:00 None None None 1620569649967681542 StanGuerini 1911591212 False None REN [#retraites] True rt : le réforme de retraite soulever le questi... [retraite, retrait, réforme, réforme de retrai... True Jean_LucFUGIT
453 laure-lavalette RT : . (RN) interpelle (LFI) : "On ne comprend... 2023-02-01 00:19:24+00:00 None None None 1620535420223213569 LCP 85362553 False None RN [] False rt : . ( RN ) interpelle ( LFI ) : " on ne com... [obstruction, majorité] True LaureLavalette
# Attach a colour to every deputy from its political group, via the
# `gp_politique_color` mapping (not defined in this section; presumably it
# comes from one of the `lib.*` star imports).
deputy_df["color"] = deputy_df.groupe_sigle.map(gp_politique_color)

# Lookup tables keyed by Twitter handle (the `twitter` column).
dep2color = dict(deputy_df[["twitter", "color"]].values)
dep2sigle = dict(deputy_df[["twitter", "groupe_sigle"]].values)
def color(node):
    """Return the colour stored for ``node`` in ``dep2color``.

    Falls back to the grey ``"#aaa"`` when ``node`` has no entry.
    """
    return dep2color.get(node, "#aaa")

def gp_legend(node):
    """Return the group acronym stored for ``node`` in ``dep2sigle``.

    Falls back to ``"NA"`` when ``node`` has no entry.
    """
    return dep2sigle.get(node, "NA")
import networkx as nx

# Edge list: deputy handle -> retweeted account.
edge_columns = ["twitter_at", "retweet_username"]
# Drop rows with a missing endpoint BEFORE casting to str: once cast, missing
# values become the literal strings "None"/"nan" and can no longer be detected
# with isna().  (Masking a frame with `df[~df.isna()]` would not drop rows
# either; it only blanks individual cells.)
graph_df = twitter_df.dropna(subset=edge_columns)[edge_columns].astype(str)
# Optional restriction to retweets between deputies only:
# graph_df = graph_df[graph_df.retweet_username.isin(deputy_df.twitter.values)]

# One row per (source, target) pair; `size` = number of retweets = edge weight.
graph_df = graph_df.groupby(edge_columns, as_index=False).size()
G = nx.from_pandas_edgelist(
    graph_df,
    source="twitter_at",
    target="retweet_username",
    edge_attr="size",
    create_using=nx.DiGraph,
)

# Prune weakly connected accounts (total degree = in + out < 4).  Degrees are
# read in one pass before anything is removed, so the outcome does not depend
# on node iteration order (removing nodes one by one while re-reading the live
# degree lowers the degree of their neighbours mid-loop).
weakly_connected = [node for node, degree in G.degree() if degree < 4]
G.remove_nodes_from(weakly_connected)

# Defensive: drop a placeholder node if the data contains the literal string
# "None" as an account name.
if "None" in G:
    G.remove_node("None")
from ipysigma import SigmaGrid

# Centrality scores used to size nodes in the second and third views.
betweeness = nx.betweenness_centrality(G)
page_rank = nx.pagerank(G)

# Rendering options shared by the three views.
common_view_options = dict(
    node_color=dep2sigle,
    node_color_palette=gp_politique_color,
    default_node_border_color="#ffffff",
    node_size_range=[3, 20],
    start_layout=10,
    default_edge_type="curve",
    label_font="Arial",
)

# Same graph rendered three times; only the node/label sizing metric (and the
# edge size range of the first view) differs between views.
(
    SigmaGrid(G, hide_search=False, columns=2)
    .add(
        name="In Degree",
        node_size=G.in_degree,
        node_label_size=G.degree,
        edge_size_range=[0.1, 1],
        **common_view_options,
    )
    .add(
        name="Betweeness",
        node_size=lambda x: betweeness[x],
        node_label_size=lambda x: betweeness[x],
        edge_size_range=[1, 5],
        **common_view_options,
    )
    .add(
        name="Page Rank",
        node_size=lambda x: page_rank[x],
        node_label_size=lambda x: page_rank[x],
        edge_size_range=[1, 5],
        **common_view_options,
    )
)

3.2.1. Content similarity

# Original (non-retweet) tweets only, de-duplicated on their text and
# restricted to those flagged by `is_keywords`.
dataset_df = (
    pd.read_parquet('data/twitter_fev_to_juin_2023_retraite_data.parquet_v2')
    .drop_duplicates(subset="full_text")
    # Retweets are recognised by their text starting with "RT".
    .loc[lambda frame: ~frame.full_text.map(lambda text: text.startswith("RT"))]
    .loc[lambda frame: frame.is_keywords]
)
dataset_df.head()
username full_text date in_reply_to_screen_name in_reply_to_status_id_str in_reply_to_user_id_str retweet_id retweet_username retweet_user_id is_quote_status quoted_status_id_str groupe_sigle hashtag is_hashtag lemmatization keywords_detected is_keywords
375 mathieu-lefevre Plus les impôts baissent et plus les recettes ... 2023-02-01 06:32:02+00:00 None None None None None None False None REN [] False plus le impôt baisser et plus le recette de l’... [travail, recette, courage, impôt] True
682 frederic-boccaletti Mme , "apparemment il y a une partie du foncti... 2023-02-01 06:50:58+00:00 None None None None None None False None RN [#motionreferendaire] True mme , " apparemment il y avoir un partie de fo... [pouvoir] True
488 philippe-brun Le prix de l'énergie est un élément essentiel ... 2023-02-01 07:02:50+00:00 None None None None None None False None SOC [] False le prix de le énergie être un élément essentie... [entreprise, compétitivité, patron] True
326 kevin-mauvieux Comprenez : « jamais nous ne défendrons la Fra... 2023-02-01 07:10:59+00:00 None None None None None None True 1620493775515828226 RN [#NonALaReformeDesRetraites] True Comprenez : « jamais nous ne défendre le Franc... [vote, retraite, retrait, ratio, français, Fra... True
453 gregoire-de-fournas Des centaines d'amendements de la NUPES ont ét... 2023-02-01 07:14:30+00:00 None None None None None None True 1620493775515828226 RN [] False un centaine de amendement de le NUPES avoir êt... [vote, ratio] True
from sklearn.metrics.pairwise import cosine_similarity
import gensim
from umap import UMAP
from ipysigma import Sigma
# Number of tweets per deputy; drives node and label sizes in the plot.
count_tweet = dict(dataset_df.groupby("username").size())
# Deputy slug (`username`) -> political group acronym.
user2groupe = dict(dataset_df[["username", "groupe_sigle"]].values)

# One document per deputy: all of their lemmatised tweets joined together.
df_corpus = dataset_df.groupby("username", as_index=False).agg(
    {"lemmatization": lambda texts: " ".join(texts)}
)
# `username` holds slugs, so the group must come from `user2groupe`;
# `dep2sigle` is keyed by Twitter handle and would not match these keys.
df_corpus["groupe_sigle"] = df_corpus["username"].map(user2groupe)

# Tag every document with its author so the model learns one vector per deputy.
corpus = df_corpus.apply(
    lambda doc: gensim.models.doc2vec.TaggedDocument(
        gensim.utils.simple_preprocess(doc.lemmatization), [doc.username]
    ),
    axis=1,
)

# Seeded like UMAP below.  Per the gensim documentation, a fully deterministic
# run additionally needs workers=1 and a fixed PYTHONHASHSEED.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=64, min_count=2, epochs=40, dm=1, seed=seed
)
model.build_vocab(corpus)
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

# Deputy vectors (rows follow `model.dv.index_to_key`), reduced by UMAP to the
# two coordinates used as x/y in the layout below.
user_vec = model.dv.vectors
user_vec_reduced = UMAP(random_state=seed).fit_transform(user_vec)

# Alternative static rendering with plotly, kept for reference:
# fig = px.scatter(x=user_vec_reduced[:,0],y=user_vec_reduced[:,1],color=[user2groupe[user] for user in model.dv.index_to_key],size=[count_tweet[user] for user in model.dv.index_to_key],color_discrete_map=gp_politique_color,
#            text= model.dv.index_to_key, height=1000,opacity=0.9,size_max=40)
# fig

# Edge-less graph: one node per deputy, placed at its 2-D embedding position.
G = nx.Graph()
G.add_nodes_from(model.dv.index_to_key)
size_func = lambda x: count_tweet[x]
Sigma(
    graph=G,
    layout={
        node: {"x": user_vec_reduced[ix, 0], "y": user_vec_reduced[ix, 1]}
        for ix, node in enumerate(model.dv.index_to_key)
    },
    node_size=size_func,
    node_color_palette=gp_politique_color,
    node_color=user2groupe,
    default_node_border_color="#efefef",
    hide_search=True,
    node_label_size=size_func,
)